***********************************************
* Title: rwanda_jde_table8.do
* Author: Todd Pugatch
* Last update: June 4 2024
* Description: analysis for Blimpo and Pugatch, "Entrepreneurship Education
*	and Teacher Training in Rwanda," Stage 2 Registered report, Journal of 
*	Development Economics
* Inputs: 	student_merge_jde.dta
*			teacher_merge_jde.dta
* Outputs: 	rwanda_jde_table8.[txt/out]
* Notes: produces Table 8 
************************************************

* Set environment
local start=`"$S_TIME"'
clear
clear matrix
clear mata
graph drop _all
program drop _all
cap log close
set more off

* Set directories 
*global main "[SET MAIN DIRECTORY HERE]"
	global rawdata "$main/01_data/01_raw"
	global cleandata "$main/01_data/02_clean"
	global dofiles "$main/02_dofiles"
	global results "$main/03_results"
	global temp "$main/04_output"


* begin log file
log using "$results/rwanda_jde_table8.txt", text replace

****************************
* SET UP MULTIPLE HYPOTHESIS CORRECTION
/*Proceed in 3 steps:
	1. Run all regressions for H1. Save coefficients & std. errors.
	2. Calculate q-values for all hypotheses.
	3. Re-run regressions to get output, including q-values.*/
****************************

* 1. GET COEF/SE/P FOR ALL REGRESSIONS	
* PREPARE DATA
qui use "$cleandata/student_merge_jde.dta", clear

* winsorize all financial variables used in analysis
foreach w in el bl {
	winsor earn_last2mths_usd_`w', gen(earn_last2mths_usdw_`w') p(.01) highonly
	lab var earn_last2mths_usdw_`w' "earn_last2mths_usd_`w', winsorized at 99th percentile"
}

* prep missing values for students without baseline outcome
local Y0 "ownbusiness_bl earn_money_bl earn_last2mths_usdw_bl"
foreach y0 in `Y0' {
	qui gen `y0'i=`y0'
	qui replace `y0'i=r(mean) if `y0'==.	
	qui su `y0' if treatment==0
	qui gen `y0'm=(`y0'==.)
	lab var `y0'i "`y0', imputing control mean for missing values"
	lab var `y0'm "missing value for `y0'"
}

* generate interaction terms between treatment and baseline characteristics
/*standardize baseline exam score*/
qui egen zS3_exam_bl=std(S3_exam_bl)
lab var zS3_exam_bl "S3_exam_bl, z-score"

/*indicator for above median SES*/
qui su ses_pc1_bl, d
qui gen ses_abvmed_bl=(ses_pc1_bl>r(p50) & ses_pc1_bl!=.)
lab var ses_abvmed_bl "SES above median"
qui save "$temp/hettemp.dta", replace

/*teacher characteristics:
	Merge student to teacher characteristics. 
		Use baseline teacher characteristics, consistent with JDE Stage 1 plan (Table P7).
		To get this, merge student and teacher data by teacherid, iff there is a match.
		Note that this is different from rwanda_jderrs2_het1.do, which matched by school.*/
qui gen teacherid=teacherid_bl if same_teacher_bl==1
		
* get relevant teacher baseline characteristics from teacher survey
local X "female_bl experience_bl qualified_bl"
*drop _merge
qui merge m:1 school_code teacherid using "$cleandata/teacher_merge_jde.dta", keepusing(`X')
qui drop if _merge==2
drop _merge
ren female_bl female_teacher_bl

/*interact baseline characteristics w/ treatment.*/
local X "female zS3_exam_bl ses_abvmed_bl female_teacher_bl experience_bl qualified_bl"
foreach x in `X' {
	qui gen D_`x'=treatment*`x'
	lab var D_`x' "treatment * `x'"
}
qui save "$temp/hettemp.dta", replace

* ANALYSIS
/*Goal: produce Table P7 (Panels A-C, columns 1-6 in each panel)
	Uses endline student survey.
	Regress outcome on treatment status and treatment interacted with baseline characteristic. 
	Control for baseline outcome, main effect of baseline characteristic, and randomization strata. 
	Cluster s.e.'s by school.
	Test sum of treatment and interaction term.
	In this Step 1, get p-values from treatment effect and interaction term.*/
local X "female zS3_exam_bl ses_abvmed_bl female_teacher_bl experience_bl qualified_bl"

/*Panel A: outcome = business involvement*/	
local y0 "ownbusiness_bli ownbusiness_blm"
local i=1 /*initialize loop*/
foreach x in `X' {
	qui areg personal_business_el treatment D_`x' `x' `y0', a(strata) cluster(school_code)
	/*get p-value of treatment effect*/
	scalar define T7_panA_c`i'bt=_b[treatment]
	scalar define T7_panA_c`i'set=_se[treatment]
	scalar define t=_b[treatment]/_se[treatment]
	scalar define T7_panA_c`i'pt=2*ttail(e(df_r),abs(t))
	/*get p-value of interaction term*/
	scalar define T7_panA_c`i'bi=_b[D_`x']
	scalar define T7_panA_c`i'sei=_se[D_`x']
	scalar define t=_b[D_`x']/_se[D_`x']
	scalar define T7_panA_c`i'pi=2*ttail(e(df_r),abs(t))
	local i=`i'+1
}

/*Panel B: outcome = employment*/	
local y0 "earn_money_bli earn_money_blm"
local i=1 /*initialize loop*/
foreach x in `X' {
	qui areg employed_el treatment D_`x' `x' `y0', a(strata) cluster(school_code)
	/*get p-value of treatment effect*/
	scalar define T7_panB_c`i'bt=_b[treatment]
	scalar define T7_panB_c`i'set=_se[treatment]
	scalar define t=_b[treatment]/_se[treatment]
	scalar define T7_panB_c`i'pt=2*ttail(e(df_r),abs(t))
	/*get p-value of interaction term*/
	scalar define T7_panB_c`i'bi=_b[D_`x']
	scalar define T7_panB_c`i'sei=_se[D_`x']
	scalar define t=_b[D_`x']/_se[D_`x']
	scalar define T7_panB_c`i'pi=2*ttail(e(df_r),abs(t))
	local i=`i'+1
}

/*Panel C: outcome = income*/	
local y0 "earn_last2mths_usdw_bli earn_last2mths_usdw_blm"
local i=1 /*initialize loop*/
foreach x in `X' {
	qui areg earn_last2mths_usdw_el treatment D_`x' `x' `y0', a(strata) cluster(school_code)
	/*get p-value of treatment effect*/
	scalar define T7_panC_c`i'bt=_b[treatment]
	scalar define T7_panC_c`i'set=_se[treatment]
	scalar define t=_b[treatment]/_se[treatment]
	scalar define T7_panC_c`i'pt=2*ttail(e(df_r),abs(t))
	/*get p-value of interaction term*/
	scalar define T7_panC_c`i'bi=_b[D_`x']
	scalar define T7_panC_c`i'sei=_se[D_`x']
	scalar define t=_b[D_`x']/_se[D_`x']
	scalar define T7_panC_c`i'pi=2*ttail(e(df_r),abs(t))
	local i=`i'+1
}	

* 2. CALCULATE Q-VALUES FOR ALL HYPOTHESES
/*Calculate sharpened q-values from Benjamini, Krieger, Yuketeli (2006).
	Use code from fdr_sharpened_qvalues.do from Anderson (2008), available at 
		https://are.berkeley.edu/~mlanderson/ARE_Website/Research.html*/
* prep data: put p-values from above into a dataset
clear
local I=`i'-1 	/*#cols in table*/
local J=3 		/*#panels in table*/
local n=`I'*`J'*2	/*# of hypotheses tested (x2 because treatment effect & interaction for each outcome*/
qui set obs `n'
qui gen table=7
qui egen panel=seq(), from(1) to(`J')
qui bysort panel: egen column=seq(), from(1) to(`I')
qui bysort panel column: egen interaction=seq(), from(0) to(1)
foreach x in b se pval {
	qui gen double `x'=.
}
forval j=1/`J' { /*loop through panels*/
	if (`j'==1) local jj "A"
	if (`j'==2) local jj "B"
	if (`j'==3) local jj "C"
	forval i=1/`I' {
		qui replace b=T7_pan`jj'_c`i'bt if panel==`j' & column==`i' & interaction==0
		qui replace se=T7_pan`jj'_c`i'set if panel==`j' & column==`i' & interaction==0
		qui replace pval=T7_pan`jj'_c`i'pt if panel==`j' & column==`i' & interaction==0 
		qui replace b=T7_pan`jj'_c`i'bi if panel==`j' & column==`i' & interaction==1
		qui replace se=T7_pan`jj'_c`i'sei if panel==`j' & column==`i' & interaction==1
		qui replace pval=T7_pan`jj'_c`i'pi if panel==`j' & column==`i' & interaction==1 
	}
}
sort table panel column interaction

* START OF ANDERSON CODE FROM fdr_sharpened_qvalues.do (modified)
* Collect the total number of p-values tested
quietly sum pval
local totalpvals = r(N)

* Sort the p-values in ascending order and generate a variable that codes each p-value's rank

quietly gen int original_sorting_order = _n
quietly sort pval
quietly gen int rank = _n if pval~=.

* Generate the variable that will contain the BKY (2006) sharpened q-values

qui gen bky06_qval = 1 if pval~=.
	
* Set the initial counter to 1 

local qval = 1

/* Set up a loop that begins by checking which hypotheses are rejected at q = 1.000, 
	then checks which hypotheses are rejected at q = 0.999, 
	then checks which hypotheses are rejected at q = 0.998, etc.  
	The loop ends by checking which hypotheses are rejected at q = 0.001. */
while `qval' > 0 {
	* First Stage
	* Generate the adjusted first stage q level we are testing: q' = q/1+q
	local qval_adj = `qval'/(1+`qval')
	
	* Generate value q'*r/M
	qui gen fdr_temp1 = `qval_adj'*rank/`totalpvals'
	
	* Generate binary variable checking condition p(r) <= q'*r/M
	qui gen reject_temp1 = (fdr_temp1>=pval) if pval~=.
	
	* Generate variable containing p-value ranks for all p-values that meet above condition
	qui gen reject_rank1 = reject_temp1*rank
	
	* Record the rank of the largest p-value that meets above condition
	qui egen total_rejected1 = max(reject_rank1)

	* Second Stage
	* Generate the second stage q level that accounts for hypotheses rejected in first stage: q_2st = q'*(M/m0)
	local qval_2st = `qval_adj'*(`totalpvals'/(`totalpvals'-total_rejected1[1]))
	
	* Generate value q_2st*r/M
	qui gen fdr_temp2 = `qval_2st'*rank/`totalpvals'
	
	* Generate binary variable checking condition p(r) <= q_2st*r/M
	qui gen reject_temp2 = (fdr_temp2>=pval) if pval~=.
	
	* Generate variable containing p-value ranks for all p-values that meet above condition
	qui gen reject_rank2 = reject_temp2*rank
	
	* Record the rank of the largest p-value that meets above condition
	qui egen total_rejected2 = max(reject_rank2)

	* A p-value has been rejected at level q if its rank is less than or equal to the rank of the max p-value that meets the above condition
	qui replace bky06_qval = `qval' if rank <= total_rejected2 & rank~=.
	
	* Reduce q by 0.001 and repeat loop
	drop fdr_temp* reject_temp* reject_rank* total_rejected*
	local qval = `qval' - .001
}

quietly sort original_sorting_order

******************************
* END OF ANDERSON CODE FROM fdr_sharpened_qvalues.do (modified)
******************************
* list results
list table panel column interaction b se pval bky06_qval

* prep results for inclusion in final regression output
sort table panel column interaction
local k=1
forval j=1/`J' { /*loop through panels*/
	if (`j'==1) local jj "A"
	if (`j'==2) local jj "B"
	if (`j'==3) local jj "C"
	forval i=1/`I' {
		scalar define T7_pan`jj'_c`i'qt=bky06_qval in `k'
		local k=`k'+1
		scalar define T7_pan`jj'_c`i'qi=bky06_qval in `k'
		local k=`k'+1
	}
}

* 3. RE-RUN REGRESSIONS TO GET OUTPUT, INCLUDING Q-VALUES
qui use "$temp/hettemp.dta", clear

* ANALYSIS
/*Goal: produce Table P7 (Panels A-C, columns 1-6 in each panel)
	Uses endline student survey.
	Regress outcome on treatment status and treatment interacted with baseline characteristic. 
	Control for baseline outcome, main effect of baseline characteristic, and randomization strata. 
	Cluster s.e.'s by school.
	Test sum of treatment and interaction term.*/
local stat ""control mean",mu_c,"treatment + interaction",coef_sum,"se(treatment + interaction)",se_sum,"interaction mean",mu_x,"q-value (treatment)",qvt,"q-value (interaction)",qvi"
local X "zS3_exam_bl ses_abvmed_bl female_teacher_bl experience_bl qualified_bl"

/*Panel A: outcome = business involvement*/	
local y0 "ownbusiness_bli ownbusiness_blm"
local i=1

local first_var: word 1 of `X'

* Initialize the file with the first variable -- female
qui areg personal_business_el treatment D_female female `y0', a(strata) cluster(school_code)
qui lincom treatment+D_female
scalar define coef_sum=r(estimate)
scalar define se_sum=r(se)
qui su female if e(sample)
scalar define mu_x=r(mean)
qui su personal_business_el if treatment==0 & e(sample)
scalar define mu_c=r(mean)
scalar define qvt=T7_panA_c1qt
scalar define qvi=T7_panA_c1qi
outreg2 treatment D_female using "$results/rwanda_jde_table8.xls", se excel nolabel nocons addstat(`stat') replace
local i=`i'+1

foreach x in `X' {
	qui areg personal_business_el treatment D_`x' `x' `y0', a(strata) cluster(school_code)
	qui lincom treatment+D_`x'
	scalar define coef_sum=r(estimate)
	scalar define se_sum=r(se)
	qui su `x' if e(sample)
	scalar define mu_x=r(mean)
	qui su personal_business_el if treatment==0 & e(sample)
	scalar define mu_c=r(mean)
	scalar define qvt=T7_panA_c`i'qt
	scalar define qvi=T7_panA_c`i'qi	
	outreg2 treatment D_`x' using "$results/rwanda_jde_table8.xls", se excel nolabel nocons addstat(`stat') append
	local i=`i'+1
}

/*Panel B: outcome = employment*/	
local y0 "earn_money_bli earn_money_blm"
local i=1
	
	* female
	qui areg employed_el treatment D_female female `y0', a(strata) cluster(school_code)
	qui lincom treatment+D_female
	scalar define coef_sum=r(estimate)
	scalar define se_sum=r(se)
	qui su female if e(sample)
	scalar define mu_x=r(mean)
	qui su employed_el if treatment==0 & e(sample)
	scalar define mu_c=r(mean)
	scalar define qvt=T7_panB_c`i'qt
	scalar define qvi=T7_panB_c`i'qi	
	outreg2 treatment D_female using "$results/rwanda_jde_table8.xls", se excel nolabel nocons addstat(`stat') append
	local i=`i'+1
	
foreach x in `X' {
	qui areg employed_el treatment D_`x' `x' `y0', a(strata) cluster(school_code)
	qui lincom treatment+D_`x'
	scalar define coef_sum=r(estimate)
	scalar define se_sum=r(se)
	qui su `x' if e(sample)
	scalar define mu_x=r(mean)
	qui su employed_el if treatment==0 & e(sample)
	scalar define mu_c=r(mean)
	scalar define qvt=T7_panB_c`i'qt
	scalar define qvi=T7_panB_c`i'qi	
	outreg2 treatment D_`x' using "$results/rwanda_jde_table8.xls", se excel nolabel nocons addstat(`stat') append
	local i=`i'+1
}

/*Panel C: outcome = income*/	
local y0 "earn_last2mths_usdw_bli earn_last2mths_usdw_blm"
local i=1
	* female: 
	qui areg earn_last2mths_usdw_el treatment D_female female `y0', a(strata) cluster(school_code)
	qui lincom treatment+D_female
	scalar define coef_sum=r(estimate)
	scalar define se_sum=r(se)
	qui su female if e(sample)
	scalar define mu_x=r(mean)
	qui su earn_last2mths_usdw_el if treatment==0 & e(sample)
	scalar define mu_c=r(mean)
	scalar define qvt=T7_panC_c`i'qt
	scalar define qvi=T7_panC_c`i'qi	
	outreg2 treatment D_female using "$results/rwanda_jde_table8.xls", se excel nolabel nocons addstat(`stat') append
	local i=`i'+1
	
foreach x in `X' {
	qui areg earn_last2mths_usdw_el treatment D_`x' `x' `y0', a(strata) cluster(school_code)
	qui lincom treatment+D_`x'
	scalar define coef_sum=r(estimate)
	scalar define se_sum=r(se)
	qui su `x' if e(sample)
	scalar define mu_x=r(mean)
	qui su earn_last2mths_usdw_el if treatment==0 & e(sample)
	scalar define mu_c=r(mean)
	scalar define qvt=T7_panC_c`i'qt
	scalar define qvi=T7_panC_c`i'qi	
	outreg2 treatment D_`x' using "$results/rwanda_jde_table8.xls", se excel nolabel nocons addstat(`stat') append
	local i=`i'+1
}	
	
erase "$temp/hettemp.dta"	
local end=`"$S_TIME"' 
di "`start'"
di "`end'"
log close
